### Rachel Porter
### 12/1/2022
### Replication of select appendix figures and tables

# loading libraries 
library("stargazer")
library("quanteda")
library("ggplot2")
library("stm")

# Point R at the replication archive. Every readRDS() call in this script uses
# a file name relative to this directory.
setwd("~/Dropbox/Primary_Elections/Issue Paper/Replication_Archive/JOP_Replication_Files")

# Start from an empty global environment (attached packages are not affected).
rm(list = ls())

####################
#### TABLE A1
####################

# Data for the missingness analysis.
missing_df <- readRDS("missingness_data.rds")

# Binomial GLM of the `truly_missing` indicator on contestation, candidate
# quality (entered as a factor), `ss_party`, and `money`.
fit_a1 <- glm(
  truly_missing ~ contested_2 + as.factor(quality_cand) + ss_party + money,
  data = missing_df,
  family = binomial()
)

# Emit the regression table.
stargazer(fit_a1)

rm(list = ls())
#################### GENDER STM ####################

# Documents for the gender topic model.
gender_docs <- readRDS("gender_text.rds")

# Build a quanteda corpus from the "text" column.
gender_corpus <- corpus(gender_docs, text_field = "text")

# Tokenize, discarding numbers, punctuation, symbols, separators and URLs.
gender_toks <- tokens(
  gender_corpus,
  remove_numbers = TRUE,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_separators = TRUE,
  remove_url = TRUE
)

# Lower-case, keep tokens of at least two characters, then drop English
# stopwords. The steps run in the same order as before, one call per line.
gender_toks <- tokens_tolower(gender_toks)
gender_toks <- tokens_select(gender_toks, min_nchar = 2)
gender_toks <- tokens_remove(gender_toks, pattern = stopwords("english"))

# Document-feature matrix with stemmed features.
gender_dfm <- dfm_wordstem(dfm(gender_toks))

# Convert to the documents/vocab/meta list that stm works with.
iss_stm <- convert(gender_dfm, to = "stm")

# Prune infrequent terms using lower.thresh = 25.
iss_stm <- prepDocuments(
  documents = iss_stm$documents,
  vocab = iss_stm$vocab,
  meta = iss_stm$meta,
  lower.thresh = 25
)

# Fit a 7-topic STM with gender as the prevalence covariate. The seed is fixed
# and spectral initialisation is used; EM is capped at 150 iterations.
fit_stm <- stm(
  documents = iss_stm$documents,
  vocab = iss_stm$vocab,
  K = 7,
  prevalence = ~ gender,
  data = iss_stm$meta,
  init.type = "Spectral",
  max.em.its = 150,
  seed = 43894
)

####################
#### COMPONENTS FOR APPENDIX TABLE A4
####################
# Mean document-topic proportion for each topic, and the top 20 terms per
# topic.
theta <- fit_stm$theta
colMeans(theta)
labelTopics(fit_stm, n = 20)

# Estimate differences in topic prevalence by gender. The covariate is turned
# into a factor before the regression. `metadata` is spelled out in full
# rather than relying on partial matching of `meta`.
iss_stm$meta$gender <- as.factor(iss_stm$meta$gender)
prep <- estimateEffect(1:7 ~ gender, fit_stm,
                       metadata = iss_stm$meta, uncertainty = "None")

# Draw the stm difference plot and keep its (invisibly returned) components so
# the estimates can be re-plotted with ggplot. Intervals are at the 90% level.
# NOTE(review): the contrast is taken between gender levels "0" and "1";
# confirm its sign and the 0/1 coding against the axis label used in the figure.
p <- plot(prep, covariate = "gender",
          model = fit_stm, method = "difference",
          cov.value1 = "0", cov.value2 = "1", ci.level = .90,
          xlim = c(-.15, .15))

# One row per topic: point estimate plus lower/upper interval bounds.
topic_ci <- as.data.frame(do.call(rbind, p$cis))
df <- cbind(data.frame(unlist(p$means)), topic_ci)
colnames(df) <- c("est", "lo", "hi")

# Hand-assigned topic labels, in topic order 1-7.
df$topic <- c("Broad Equality", "Action & Advocate", "Violence",
              "Healthcare", "Repro. Rights", "Equal Pay", "Childcare")

####################
#### APPENDIX FIGURE 2
####################
# Coefficient plot: one point estimate and interval per topic, with a dashed
# reference line at zero. coord_flip() places the topics on the vertical axis.
# The constant size mapping inside aes() is kept as written; the legend it
# would create is hidden via legend.position = "none".
# NOTE(review): ylim() is narrower here (+/- 0.1) than the xlim used in the
# stm plot (+/- 0.15); confirm no interval falls outside and gets dropped.
ggplot(data = df, mapping = aes(x = topic, y = est)) +
  geom_point(mapping = aes(size = 1)) +
  geom_errorbar(mapping = aes(ymin = lo, ymax = hi), width = 0.1) +
  geom_hline(yintercept = 0, linetype = "dashed", size = 1) +
  ylim(-0.1, 0.1) +
  coord_flip() +
  labs(x = "", y = "\nMean Diff. between Female and Male Candidates") +
  theme_bw() +
  theme(text = element_text(size = 25), legend.position = "none")

rm(list = ls())
#################### RACE STM ####################

# Documents for the race topic model.
race_docs <- readRDS("race_text.rds")

# Build a quanteda corpus from the "text" column.
race_corpus <- corpus(race_docs, text_field = "text")

# Tokenize, discarding numbers, punctuation, symbols, separators and URLs.
race_toks <- tokens(
  race_corpus,
  remove_numbers = TRUE,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_separators = TRUE,
  remove_url = TRUE
)

# Lower-case, keep tokens of at least two characters, then drop English
# stopwords. The steps run in the same order as before, one call per line.
race_toks <- tokens_tolower(race_toks)
race_toks <- tokens_select(race_toks, min_nchar = 2)
race_toks <- tokens_remove(race_toks, pattern = stopwords("english"))

# Document-feature matrix with stemmed features.
race_dfm <- dfm_wordstem(dfm(race_toks))

# Convert to the documents/vocab/meta list that stm works with.
iss_stm <- convert(race_dfm, to = "stm")

# Prune infrequent terms using lower.thresh = 25.
iss_stm <- prepDocuments(
  documents = iss_stm$documents,
  vocab = iss_stm$vocab,
  meta = iss_stm$meta,
  lower.thresh = 25
)

# Fit a 6-topic STM with `black` as the prevalence covariate. The seed is
# fixed and spectral initialisation is used; EM is capped at 150 iterations.
fit_stm <- stm(
  documents = iss_stm$documents,
  vocab = iss_stm$vocab,
  K = 6,
  prevalence = ~ black,
  data = iss_stm$meta,
  init.type = "Spectral",
  max.em.its = 150,
  seed = 43894
)

####################
#### COMPONENTS FOR APPENDIX TABLE A5
####################
# Mean document-topic proportion for each topic, and the top 20 terms per
# topic.
theta <- fit_stm$theta
colMeans(theta)
labelTopics(fit_stm, n = 20)

# Estimate differences in topic prevalence by the `black` indicator. The
# covariate is turned into a factor before the regression. `metadata` is
# spelled out in full rather than relying on partial matching of `meta`.
iss_stm$meta$black <- as.factor(iss_stm$meta$black)
prep <- estimateEffect(1:6 ~ black, fit_stm,
                       metadata = iss_stm$meta, uncertainty = "None")

# Draw the stm difference plot and keep its (invisibly returned) components so
# the estimates can be re-plotted with ggplot. Intervals are at the 90% level.
# NOTE(review): the contrast is taken between `black` levels "0" and "1";
# confirm its sign and the 0/1 coding against the axis label used in the figure.
p <- plot(prep, covariate = "black",
          model = fit_stm, method = "difference",
          cov.value1 = "0", cov.value2 = "1", ci.level = .90,
          xlim = c(-.15, .15))

# One row per topic: point estimate plus lower/upper interval bounds.
topic_ci <- as.data.frame(do.call(rbind, p$cis))
df <- cbind(data.frame(unlist(p$means)), topic_ci)
colnames(df) <- c("est", "lo", "hi")

# Hand-assigned topic labels, in topic order 1-6.
df$topic <- c("Broad Equality", "Voter Suppression", "War on Drugs",
              "Criminal Justice", "Racial Inequality", "Law Enforcement")

####################
#### APPENDIX FIGURE 3
####################
# Coefficient plot: one point estimate and interval per topic, with a dashed
# reference line at zero. coord_flip() places the topics on the vertical axis.
# The constant size mapping inside aes() is kept as written; the legend it
# would create is hidden via legend.position = "none".
ggplot(data = df, mapping = aes(x = topic, y = est)) +
  geom_point(mapping = aes(size = 1)) +
  geom_errorbar(mapping = aes(ymin = lo, ymax = hi), width = 0.1) +
  geom_hline(yintercept = 0, linetype = "dashed", size = 1) +
  ylim(-0.15, 0.15) +
  coord_flip() +
  labs(x = "", y = "\nMean Diff. between Black and White Candidates ") +
  theme_bw() +
  theme(text = element_text(size = 25), legend.position = "none")

rm(list = ls())
####################
#### APPENDIX TABLE A10
####################
# Two binomial GLMs, each fit on the "Strategic" subset of its own data set,
# reported side by side in a single stargazer table.

# --- Left column: male Democrats data -------------------------------------
master <- readRDS("male_democrats_data.rds")

# Keep rows flagged "Strategic" that also have women_number > 0.
strategic <- subset(master, master$strategic == "Strategic" &
                      master$women_number > 0)

fit.strat1 <- glm(narrow_female ~ quality_cand + openrace + dem_pres_av +
                    primary_type + year + minority + as.factor(Value) +
                    strategic_female + candnumber,
                  data = strategic, family = binomial())

# --- Right column: white Democrats data -----------------------------------
master <- readRDS("white_democrats_data.rds")

# Keep rows flagged "Strategic" that also have black_number > 0.
strategic <- subset(master, master$strategic == "Strategic" &
                      master$black_number > 0)

# `candnumber` used to be listed twice in this formula. R keeps only the first
# occurrence of a repeated term, so dropping the repeat leaves the fitted
# model and the coefficient order unchanged.
# NOTE(review): `Value` enters as-is here but via as.factor() in the left
# column; confirm that the difference is intended.
fit.strat2 <- glm(narrow_race ~ quality_cand + openrace + dem_pres_av +
                    primary_type + gf + gender + black_alone + south +
                    candnumber + strategic_black + Value,
                  data = strategic, family = binomial())

# NOTE(review): both cutoffs (0.05 and 0.01) are given the same "*" symbol, so
# the two significance levels are not distinguishable in the output; confirm
# this matches the published table.
stargazer(fit.strat1, fit.strat2,
          star.char = c("*", "*"),
          star.cutoffs = c(0.05, 0.01))
